Show all the columns that have no relevant information to analyze
(empty)
## [1] "height_cm_at_diagnosis"
## [2] "weight_kg_at_diagnosis"
## [3] "cause_of_death"
## [4] "cause_of_death_other"
## [5] "tobacco_smoking_age_started"
## [6] "hiv_status"
## [7] "nadir_cd4_counts"
## [8] "cd4_counts_at_diagnosis"
## [9] "hiv_rna_load_at_diagnosis"
## [10] "prior_aids_conditions"
## [11] "hbv_test_results"
## [12] "hcv_test_results"
## [13] "hpv_test_results"
## [14] "kshv_hhv8_test_results"
## [15] "haart_therapy_prior_to_dx"
## [16] "haart_therapy_at_dx"
## [17] "cdc_hiv_risk_group"
## [18] "prior_mailgnancy_type"
## [19] "history_immunological_disease"
## [20] "eml4_alk_translocation_variant"
## [21] "history_immunological_disease_other"
## [22] "history_immunosuppresive_rx"
## [23] "history_immunosuppressive_rx_other"
## [24] "history_relevant_infectious_dx"
## [25] "history_immunosuppresive_dx_other"
## [26] "laterality"
## [27] "method_initial_path_dx"
## [28] "method_initial_path_dx_other"
## [29] "lymph_nodes_examined"
## [30] "lymph_nodes_examined_count"
## [31] "lymph_nodes_examined_he_count"
## [32] "lymph_nodes_examined_ihc_count"
## [33] "pos_lymph_node_location"
## [34] "other_pos_node_location"
## [35] "ajcc_tumor_clinical_ct"
## [36] "ajcc_nodes_clinical_cn"
## [37] "ajcc_metastasis_clinical_cm"
## [38] "ajcc_clinical_tumor_stage"
## [39] "followup_lost_to"
## [40] "cancer_diagnosis_cancer_type_icd9_text_name"
## [41] "days_to_form_completion"
## [42] "days_to_hiv_diagnosis"
## [43] "days_to_patient_progression_free"
## [44] "days_to_sample_procurement"
## [45] "days_to_tumor_progression"
## [46] "egfr_mutation_identified"
## [47] "eml4_alk_translocation_identified"
## [48] "extranodal_involvement"
## [49] "family_member_relationship_type"
## [50] "margin_status"
## [51] "measure_of_response"
## [52] "metastatic_site_at_diagnosis"
## [53] "metastatic_site_at_diagnosis_other"
## [54] "number_cycles"
## [55] "pharm_regimen"
## [56] "pharm_regimen_other"
## [57] "prior_systemic_therapy_type"
## [58] "regimen_indication"
## [59] "relative_family_cancer_history"
## [60] "stage_other"
## [61] "stem_cell_transplantation"
## Create the individual plots of age, Vital status and gender

# Theme layers shared by the three panels: minimal base theme plus bold,
# centred 12 pt titles and axis text. Adding a list to a ggplot adds its
# elements in order, so this is the same as `+ theme_minimal() + theme(...)`.
luad_panel_text <- element_text(size = 12, face = "bold", angle = 0, hjust = 0.5)
luad_panel_theme <- list(
  theme_minimal(),
  theme(
    axis.title = luad_panel_text,
    title = luad_panel_text,
    axis.text = luad_panel_text
  )
)

# Histogram of age at initial pathologic diagnosis, 10-unit bins
plot_age <- ggplot(clinical_data_filter_col, aes(x = age_at_initial_pathologic_diagnosis)) +
  geom_histogram(binwidth = 10, color = "black", fill = "#7AC5CD", alpha = 0.7) +
  labs(title = "Age Distribution of LUAD Patients", x = "Age", y = "Abs. Frequency") +
  luad_panel_theme

# Bar chart of vital status. The fill is mapped through a manual scale instead
# of a constant length-2 vector, so an unexpected extra bar (e.g. NA) no longer
# aborts the plot; the legend is hidden because the x axis already labels the bars.
plot_vital_status <- ggplot(clinical_data_filter_col, aes(x = vital_status)) +
  geom_bar(aes(fill = vital_status), color = "black", show.legend = FALSE) +
  scale_fill_manual(values = c("#66CDAA", "#CD1076")) +
  labs(title = "Vital Status of LUAD Patients", x = "Vital Status", y = "Total") +
  luad_panel_theme

# Bar chart of gender, built the same way as the vital-status panel
plot_gender <- ggplot(clinical_data_filter_col, aes(x = gender)) +
  geom_bar(aes(fill = gender), color = "black", show.legend = FALSE) +
  scale_fill_manual(values = c("#9A32CD", "#1874CD")) +
  labs(title = "Gender Distribution of LUAD Patients", x = "Gender", y = "Total") +
  luad_panel_theme

# Arrange the plots in a grid (grid.arrange also draws the grid on the current device)
grid_plot <- as.ggplot(grid.arrange(plot_age, plot_vital_status, plot_gender, ncol = 2))

# Save the combined figure; width and height are given in pixels
ggsave("./Grid_plots.png", plot = grid_plot, width = 2200, height = 1600, units = "px")
# Age histogram (10-unit bins) filled by vital status, one panel per gender
ggplot(
  data = clinical_data_filter_col,
  mapping = aes(x = age_at_initial_pathologic_diagnosis, fill = vital_status)
) +
  geom_histogram(binwidth = 10, colour = "black") +
  # One facet per gender
  facet_wrap(vars(gender)) +
  scale_fill_manual(values = c("#66CDAA", "#CD1076")) +
  labs(
    title = "Age Distribution Stratified by Gender and Vital Status",
    x = "Age",
    y = "Total"
  ) +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 14, face = "bold", angle = 0, hjust = 0.5),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 12)
  )

# First step: drop the rows whose ajcc_pathologic_tumor_stage is NA
clinical_data_violin <- clinical_data_filter_col %>%
  filter(!is.na(ajcc_pathologic_tumor_stage))

# Violin plot of patient age across AJCC pathologic tumor stages, overlaid with
# a narrow boxplot and the individual observations as jittered points
ggplot(
  clinical_data_violin,
  aes(x = ajcc_pathologic_tumor_stage, y = age_at_initial_pathologic_diagnosis)
) +
  geom_violin(fill = "lightblue", color = "black", alpha = 0.7) +
  # The jitter layer below already draws every observation, so the boxplot's
  # own outlier markers are suppressed to avoid plotting those points twice
  geom_boxplot(
    width = 0.2, color = "black", alpha = 0.5,
    outlier.shape = NA, show.legend = FALSE
  ) +
  geom_jitter(aes(color = ajcc_pathologic_tumor_stage), width = 0.2, alpha = 0.6, size = 2) +
  labs(
    title = "Age Distribution by AJCC Pathologic Tumor Stage",
    x = "AJCC Pathologic Tumor Stage",
    y = "Age"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 14, face = "bold"),
    # x tick labels are hidden; the colour legend of the jitter layer names the stages
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 12)
  )

# Absolute frequency of each value of anatomic_organ_subdivision
data <- table(clinical_data_filter_col$anatomic_organ_subdivision)

# Two-column data frame: region label and its absolute frequency
df <- data.frame(Region = names(data), Count = as.numeric(data))

# Slice colours for the pie chart
region_colors <- c("#66CDAA", "#CD1076", "#1E90FF", "#FFD700", "#8A2BE2")

# Interactive (plotly) pie chart of the distribution of lung regions;
# slices and hover text both show the label and its percentage
layout(
  plot_ly(
    df,
    labels = ~Region,
    values = ~Count,
    type = "pie",
    textinfo = "label+percent",
    hoverinfo = "label+percent",
    marker = list(colors = region_colors)
  ),
  title = "Distribution of Lung Regions",
  showlegend = TRUE
)
Survival analysis
Kaplan-Meier survival curves
Tumor stage (survival distributions between the groups are
significantly different)
# Survival curves stratified by AJCC pathologic tumor stage
fit <- survfit(
  Surv(survival_time, vital_status) ~ ajcc_pathologic_tumor_stage,
  data = clinical_data_survival
)

# Legend labels shared by both plots below, one per stratum
stage_labels <- c(
  "Discrepancy", "Stage I", "Stage IA", "Stage IB", "Stage II",
  "Stage IIA", "Stage IIB", "Stage IIIA", "Stage IIIB", "Stage IV"
)

# Curves with the p-value printed on the plot and the legend placed inside it
ggsurvplot(
  fit,
  data = clinical_data_survival,
  pval = TRUE,
  risk.table = FALSE,
  legend = c(0.93, 0.6),
  legend.labs = stage_labels,
  legend.title = element_blank()
)

# Same curves accompanied by the number-at-risk table
ggsurvplot(
  fit,
  data = clinical_data_survival,
  risk.table = TRUE,
  legend.labs = stage_labels,
  legend.title = element_blank(),
  risk.table.height = 1
)

EGFR mutation (no significant difference between the survival distributions)
# Survival curves stratified by EGFR mutation status
fit_egfr <- survfit(
  formula = Surv(survival_time, vital_status) ~ egfr_mutation_status,
  data = clinical_data_survival
)
# Plot the curves with the p-value of the group comparison
ggsurvplot(
  fit_egfr,
  data = clinical_data_survival,
  pval = TRUE
)

KRAS mutation (no significant difference between the survival distributions)
# Survival curves stratified by whether a KRAS mutation was found
fit_kras <- survfit(
  formula = Surv(survival_time, vital_status) ~ kras_mutation_found,
  data = clinical_data_survival
)
# Plot the curves with the p-value of the group comparison
ggsurvplot(
  fit_kras,
  data = clinical_data_survival,
  pval = TRUE
)

Cox Proportional Hazards Model
# Adjust the model: Cox regression on age at diagnosis, AJCC pathologic tumor
# stage and smoking pack-years
cox <- coxph(
  Surv(survival_time, vital_status) ~ age_at_initial_pathologic_diagnosis +
    ajcc_pathologic_tumor_stage + tobacco_smoking_pack_years_smoked,
  data = clinical_data_survival
)
# summary(cox)

# Obtain the results, create a dataframe for the graphic and filter
summary_cox <- summary(cox)
hr_data <- data.frame(
  Variable = rownames(summary_cox$coefficients),
  # The "coef" column is the log hazard ratio, so it is exponentiated here
  HR = exp(summary_cox$coefficients[, "coef"]),
  # summary.coxph() reports conf.int for exp(coef), i.e. already on the
  # hazard-ratio scale; it must not be exponentiated a second time
  lower_ci = summary_cox$conf.int[, "lower .95"],
  upper_ci = summary_cox$conf.int[, "upper .95"],
  p_value = summary_cox$coefficients[, "Pr(>|z|)"]
)
# Keep only the terms with p < 0.05
hr_data_filtered <- hr_data %>% filter(p_value < 0.05)

# Forest plot (only the terms with a significant effect), hazard ratios on a
# log10 x axis with their 95% confidence intervals
ggplot(hr_data_filtered, aes(x = HR, y = Variable)) +
  geom_point(aes(color = p_value < 0.05), size = 4) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2) +
  scale_x_log10() +
  labs(
    title = "Hazard Ratios (HR) de Supervivencia con Regresión de Cox",
    x = "Hazard Ratio (HR)",
    y = "Variables"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1)
  ) +
  # Reference line at HR = 1 (no effect)
  geom_vline(xintercept = 1, linetype = "dashed", color = "red")
